\documentclass{article}

\usepackage{pdfpages}
\usepackage[a4paper, total={6in, 8in}]{geometry}
\usepackage{amsfonts, amsmath, amssymb, amsthm}
\usepackage{enumerate}
\usepackage[english]{babel}
\usepackage{listings}
\usepackage{subcaption}
\usepackage[shortlabels]{enumitem}
\usepackage[pagebackref=true,breaklinks=true,colorlinks,bookmarks=false]{hyperref}

\usepackage{pythonhighlight}

\begin{document}
% \begin{equation}
%   p(x) = \int p(y)p(x|y) \mathrm{d}y
% \end{equation}
% \begin{equation}
%   \log p(x) \geq \int p(y|\theta) \log p(x | y) \mathrm{d}y
% \end{equation}
% \begin{equation}
%   p(x) = \int p(y)p(x|y) \mathrm{d}y = \int \left(\int p(z)p(y|z) \mathrm{d}z\right) p(x|y) \mathrm{d}y
% \end{equation}
% \begin{equation}
%   p(x) = \int  p(z) \int p(y|z) p(x|y)\mathrm{d}y \mathrm{d}z
% \end{equation}
% \begin{align}
%   \log p(x)
%   \geq & \int  p(z) \log \int p(y|z) p(x|y)\mathrm{d}y \mathrm{d}z \\
%   \geq & \int  p(z) \int p(y|z) \log p(x|y)\mathrm{d}y \mathrm{d}z \\
% \end{align}
% Let $z \sim \mathrm{Cat}(\theta)$
% \begin{align}
%           & \nabla_\theta \int p(z|\theta) \int p(y|z) \log p(x|y)\mathrm{d}y \mathrm{d}z                                        \\
%   =       & \int p(z|\theta) \nabla_\theta \log p(z|\theta) \int p(y|z) \log p(x|y)\mathrm{d}y \mathrm{d}z                       \\
%   =       & \int p(z|\theta) \int p(y|z) \log p(x|y)\mathrm{d}y \ \nabla_\theta \log p(z|\theta)\mathrm{d}z                      \\
%   \approx & \int p(y|z) \log p(x|y)\mathrm{d}y \ \nabla_\theta \log p(z|\theta)                             & z \sim p(z|\theta) \\
%   \approx & \log p(x|y) \nabla_\theta \log p(z|\theta)                                                      & y \sim p(y|z)
% \end{align}
% Let $y \sim \mathrm{Cat}(\theta)$
% \begin{align}
%           & \nabla_\theta \int p(z) \int p(y|z, \theta) \log p(x|y)\mathrm{d}y \mathrm{d}z                      \\
%   \approx & \int p(y|z, \theta) \log p(x|y) \nabla_\theta \log p(y|z, \theta) \mathrm{d}y  & z \sim p(z)        \\
%   \approx & \int \log p(x|y) \nabla_\theta \log p(y|z, \theta)                             & y \sim p(y|\theta)
% \end{align}
% Let $y \sim \mathrm{Cat}(\theta)$
% \begin{align}
%           & \nabla_\theta \int p(z) \int p(y|z, \theta) \log p(x|y)\mathrm{d}y \mathrm{d}z                      \\
%   \approx & \int p(y|z, \theta) \log p(x|y) \nabla_\theta \log p(y|z, \theta) \mathrm{d}y  & z \sim p(z)        \\
%   \approx & \int \log p(x|y) \nabla_\theta \log p(y|z, \theta)                             & y \sim p(y|\theta)
% \end{align}

% \newpage

\paragraph{Multi-layer NFMM Model Architecture}

\begin{equation}
  p(x_1|i_1) = p(x_0)\left| \det \left(\frac{\partial f(x_0, \theta_1, i_1)}{\partial x_0}\right) \right|^{-1}
\end{equation}

\begin{equation}
  p(x_1) = \int p(i_1)p(x_1|i_1) \mathrm{d}i_1
\end{equation}

\begin{equation}
  p(x_2|i_2) = p(x_1)\left| \det \left(\frac{\partial f(x_1, \theta_2, i_2)}{\partial x_1}\right) \right|^{-1}
\end{equation}

\begin{equation}
  p(x_2) = \int p(i_2) p(x_2|i_2) \mathrm{d}i_2
\end{equation}

\begin{equation*}
  \vdots
\end{equation*}

\paragraph{Single NFMM model optimization}

Combining equations above with $p(i_1) := p(i_1 | \alpha_1)$,
\begin{equation}
  p(x_1) = \int p(i_1|\alpha_1)p(x_0)\left| \det \left(\frac{\partial f(x_0, \theta_1, i_1)}{\partial x_0}\right) \right|^{-1} \mathrm{d}i_1
\end{equation}

By Jensen's inequality,
\begin{equation}
  \log p(x_1) \geq \int p(i_1|\alpha_1) \log \left( p(x_0)\left| \det \left(\frac{\partial f(x_0, \theta_1, i_1)}{\partial x_0}\right) \right|^{-1} \right) \mathrm{d}i_1 =: \mathcal{L}(\alpha_1; \theta_1)
\end{equation}

Use REINFORCE for optimizing $\alpha_1$,

\begin{align}
  \nabla_{\alpha_1} \mathcal{L}(\alpha_1; \theta_1)
  =       & \int p(i_1|\alpha_1) \log \left( p(x_0)\left| \det \left(\frac{\partial f(x_0, \theta_1, i_1)}{\partial x_0}\right) \right|^{-1} \right) \nabla_{\alpha_1} \log p(i_1|\alpha_1) \mathrm{d}i_1     \\
  \approx & \log \left( p(x_0)\left| \det \left(\frac{\partial f(x_0, \theta_1, i_1)}{\partial x_0}\right) \right|^{-1} \right) \nabla_{\alpha_1} \log p(i_1|\alpha_1) \text{ with } i_1 \sim p(i_1|\alpha_1)
\end{align}

\paragraph{Two-layer NFMM model optimization}

Let $p(i_1) := p(i_1 | \alpha_1)$ and $p(i_2) := p(i_2 | \alpha_2)$.

We abbreviate
$J_2 := \left| \det \left(\frac{\partial f(x_1, \theta_2, i_2)}{\partial x_1}\right) \right|^{-1}$
and
$J_1 := \left| \det \left(\frac{\partial f(x_0, \theta_1, i_1)}{\partial x_0}\right) \right|^{-1}$.

\begin{align}
    & p(x_2)                                                                             \\
  = & \int p(i_2|\alpha_2) p(x_1)J_2 \mathrm{d}i_2                                       \\
  = & \int p(i_2|\alpha_2) \int p(i_1|\alpha_1)p(x_0)J_1 \mathrm{d}i_1 J_2 \mathrm{d}i_2
\end{align}

\begin{align}
       & \log p(x_2)                                                                                         \\
  \geq & \int p(i_2|\alpha_2) \log \int p(i_1|\alpha_1)p(x_0)J_2J_1 \mathrm{d}i_1 \mathrm{d}i_2              \\
  \geq & \int p(i_2|\alpha_2) \int p(i_1|\alpha_1) \log\left(p(x_0)J_2J_1\right) \mathrm{d}i_1 \mathrm{d}i_2 \\
  =:   & \mathcal{L}(\alpha_1, \alpha_2; \theta_1, \theta_2)
\end{align}

Use REINFORCE for optimizing $\alpha_1$,
\begin{align*}
  \nabla_{\alpha_1} \mathcal{L}(\alpha_1, \alpha_2; \theta_1, \theta_2)
  \approx & \int \nabla_{\alpha_1} p(i_1|\alpha_1) \log\left(p(x_0)J_2J_1\right) \mathrm{d}i_1 \text{ with } i_2 \sim p(i_2|\alpha_2)              \\
  \approx & \log\left(p(x_0)J_2J_1\right) \nabla_{\alpha_1} \log p(i_1|\alpha_1) \text{ with } i_2 \sim p(i_2|\alpha_2), i_1 \sim p(i_1| \alpha_1)
\end{align*}

Use REINFORCE for optimizing $\alpha_2$,
\begin{align*}
  \nabla_{\alpha_2} \mathcal{L}(\alpha_1, \alpha_2; \theta_1, \theta_2)
  =       & \int \nabla_{\alpha_2} p(i_2|\alpha_2) \int p(i_1|\alpha_1) \log\left(p(x_0)J_2J_1\right) \mathrm{d}i_1 \mathrm{d}i_2                          \\
  \approx & \int p(i_1|\alpha_1) \log\left(p(x_0)J_2J_1\right) \mathrm{d}i_1 \nabla_{\alpha_2} \log p(i_2|\alpha_2) \text{ with } i_2 \sim p(i_2|\alpha_2) \\
  \approx & \log\left(p(x_0)J_2J_1\right) \nabla_{\alpha_2} \log p(i_2|\alpha_2)  \text{ with } i_2 \sim p(i_2|\alpha_2), i_1 \sim p(i_1| \alpha_1)
\end{align*}



\end{document}
